Importing Libraries

library(readxl)
library(ggplot2)
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(e1071)
library(naivebayes)
## naivebayes 0.9.7 loaded
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.1     ✓ purrr   0.3.4
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x randomForest::combine() masks dplyr::combine()
## x tidyr::extract()        masks magrittr::extract()
## x dplyr::filter()         masks stats::filter()
## x dplyr::lag()            masks stats::lag()
## x purrr::lift()           masks caret::lift()
## x randomForest::margin()  masks ggplot2::margin()
## x purrr::set_names()      masks magrittr::set_names()
library(caret)
library(SuperLearner)
## Loading required package: nnls
## Super Learner
## Version: 2.0-26
## Package created on 2019-10-27
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(class)
library(corrplot)
## corrplot 0.84 loaded
library(GoodmanKruskal)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ROSE)
## Loaded ROSE 0.0-3

Data Importing

# Importing all the case study 2 data set from github 
raw_dataset="https://raw.githubusercontent.com/lijjumathew/MSDS-Doing-Datascience-Project2/master/Data/CaseStudy2-data.csv"
no_attrition_dataset="https://raw.githubusercontent.com/lijjumathew/MSDS-Doing-Datascience-Project2/master/Data/CaseStudy2CompSet%20No%20Attrition.csv"
no_salary_dataset="https://raw.githubusercontent.com/lijjumathew/MSDS-Doing-Datascience-Project2/master/Data/CaseStudy2CompSet%20No%20Salary.csv"
raw_data <- read.csv(raw_dataset, sep = ",", header = TRUE)
no_attrition_data <- read.csv(no_attrition_dataset, sep = ",", header = TRUE)
no_salary_data <- read.csv(no_salary_dataset, sep = ",", header = TRUE)

# Checking details of case study 2 dataset
dim(raw_data)
## [1] 870  36
names(raw_data)
##  [1] "ID"                       "Age"                     
##  [3] "Attrition"                "BusinessTravel"          
##  [5] "DailyRate"                "Department"              
##  [7] "DistanceFromHome"         "Education"               
##  [9] "EducationField"           "EmployeeCount"           
## [11] "EmployeeNumber"           "EnvironmentSatisfaction" 
## [13] "Gender"                   "HourlyRate"              
## [15] "JobInvolvement"           "JobLevel"                
## [17] "JobRole"                  "JobSatisfaction"         
## [19] "MaritalStatus"            "MonthlyIncome"           
## [21] "MonthlyRate"              "NumCompaniesWorked"      
## [23] "Over18"                   "OverTime"                
## [25] "PercentSalaryHike"        "PerformanceRating"       
## [27] "RelationshipSatisfaction" "StandardHours"           
## [29] "StockOptionLevel"         "TotalWorkingYears"       
## [31] "TrainingTimesLastYear"    "WorkLifeBalance"         
## [33] "YearsAtCompany"           "YearsInCurrentRole"      
## [35] "YearsSinceLastPromotion"  "YearsWithCurrManager"
head(raw_data)
##   ID Age Attrition    BusinessTravel DailyRate             Department
## 1  1  32        No     Travel_Rarely       117                  Sales
## 2  2  40        No     Travel_Rarely      1308 Research & Development
## 3  3  35        No Travel_Frequently       200 Research & Development
## 4  4  32        No     Travel_Rarely       801                  Sales
## 5  5  24        No Travel_Frequently       567 Research & Development
## 6  6  27        No Travel_Frequently       294 Research & Development
##   DistanceFromHome Education   EducationField EmployeeCount EmployeeNumber
## 1               13         4    Life Sciences             1            859
## 2               14         3          Medical             1           1128
## 3               18         2    Life Sciences             1           1412
## 4                1         4        Marketing             1           2016
## 5                2         1 Technical Degree             1           1646
## 6               10         2    Life Sciences             1            733
##   EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1                       2   Male         73              3        2
## 2                       3   Male         44              2        5
## 3                       3   Male         60              3        3
## 4                       3 Female         48              3        3
## 5                       1 Female         32              3        1
## 6                       4   Male         32              3        3
##                  JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1        Sales Executive               4      Divorced          4403
## 2      Research Director               3        Single         19626
## 3 Manufacturing Director               4        Single          9362
## 4        Sales Executive               4       Married         10422
## 5     Research Scientist               4        Single          3760
## 6 Manufacturing Director               1      Divorced          8793
##   MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1        9250                  2      Y       No                11
## 2       17544                  1      Y       No                14
## 3       19944                  2      Y       No                11
## 4       24032                  1      Y       No                19
## 5       17218                  1      Y      Yes                13
## 6        4809                  1      Y       No                21
##   PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1                 3                        3            80                1
## 2                 3                        1            80                0
## 3                 3                        3            80                0
## 4                 3                        3            80                2
## 5                 3                        3            80                0
## 6                 4                        3            80                2
##   TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1                 8                     3               2              5
## 2                21                     2               4             20
## 3                10                     2               3              2
## 4                14                     3               3             14
## 5                 6                     2               3              6
## 6                 9                     4               2              9
##   YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1                  2                       0                    3
## 2                  7                       4                    9
## 3                  2                       2                    2
## 4                 10                       5                    7
## 5                  3                       1                    3
## 6                  7                       1                    7
#View(raw_data)

EDA - Summary Stats

# Get a basic idea of all the variables.
summary(raw_data)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   No :730   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   Yes:140   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Human Resources : 15    Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Life Sciences   :358    1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Marketing       :100    Median :1     Median :1039.0   Median :3.000          
##  Medical         :270    Mean   :1     Mean   :1029.8   Mean   :2.701          
##  Other           : 52    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##  Technical Degree: 75    Max.   :1     Max.   :2064.0   Max.   :4.000          
##                                                                                
##     Gender      HourlyRate     JobInvolvement     JobLevel    
##  Female:354   Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Male  :516   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 66.00   Median :3.000   Median :2.000  
##               Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##               3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##               Max.   :100.00   Max.   :4.000   Max.   :5.000  
##                                                               
##                       JobRole    JobSatisfaction  MaritalStatus MonthlyIncome  
##  Sales Executive          :200   Min.   :1.000   Divorced:191   Min.   : 1081  
##  Research Scientist       :172   1st Qu.:2.000   Married :410   1st Qu.: 2840  
##  Laboratory Technician    :153   Median :3.000   Single  :269   Median : 4946  
##  Manufacturing Director   : 87   Mean   :2.709                  Mean   : 6390  
##  Healthcare Representative: 76   3rd Qu.:4.000                  3rd Qu.: 8182  
##  Sales Representative     : 53   Max.   :4.000                  Max.   :19999  
##  (Other)                  :129                                                 
##   MonthlyRate    NumCompaniesWorked Over18  OverTime  PercentSalaryHike
##  Min.   : 2094   Min.   :0.000      Y:870   No :618   Min.   :11.0     
##  1st Qu.: 8092   1st Qu.:1.000              Yes:252   1st Qu.:12.0     
##  Median :14074   Median :2.000                        Median :14.0     
##  Mean   :14326   Mean   :2.728                        Mean   :15.2     
##  3rd Qu.:20456   3rd Qu.:4.000                        3rd Qu.:18.0     
##  Max.   :26997   Max.   :9.000                        Max.   :25.0     
##                                                                        
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.152     Mean   :2.707            Mean   :80    Mean   :0.7839  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##                                                                           
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.05     Mean   :2.832         Mean   :2.782   Mean   : 6.962  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.00       
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.00       
##  Median : 3.000     Median : 1.000          Median : 3.00       
##  Mean   : 4.205     Mean   : 2.169          Mean   : 4.14       
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.00       
##  Max.   :18.000     Max.   :15.000          Max.   :17.00       
## 
# This gives us a high level view of the case study 2 dataset. 
# We can see that we don't have any NULL values in our case study 2 dataset. 
# We can also see that we just have 140 attrition records out of 870 which is approximately 16%.

str(raw_data)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
summary(raw_data)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   No :730   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   Yes:140   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Human Resources : 15    Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Life Sciences   :358    1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Marketing       :100    Median :1     Median :1039.0   Median :3.000          
##  Medical         :270    Mean   :1     Mean   :1029.8   Mean   :2.701          
##  Other           : 52    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##  Technical Degree: 75    Max.   :1     Max.   :2064.0   Max.   :4.000          
##                                                                                
##     Gender      HourlyRate     JobInvolvement     JobLevel    
##  Female:354   Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Male  :516   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 66.00   Median :3.000   Median :2.000  
##               Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##               3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##               Max.   :100.00   Max.   :4.000   Max.   :5.000  
##                                                               
##                       JobRole    JobSatisfaction  MaritalStatus MonthlyIncome  
##  Sales Executive          :200   Min.   :1.000   Divorced:191   Min.   : 1081  
##  Research Scientist       :172   1st Qu.:2.000   Married :410   1st Qu.: 2840  
##  Laboratory Technician    :153   Median :3.000   Single  :269   Median : 4946  
##  Manufacturing Director   : 87   Mean   :2.709                  Mean   : 6390  
##  Healthcare Representative: 76   3rd Qu.:4.000                  3rd Qu.: 8182  
##  Sales Representative     : 53   Max.   :4.000                  Max.   :19999  
##  (Other)                  :129                                                 
##   MonthlyRate    NumCompaniesWorked Over18  OverTime  PercentSalaryHike
##  Min.   : 2094   Min.   :0.000      Y:870   No :618   Min.   :11.0     
##  1st Qu.: 8092   1st Qu.:1.000              Yes:252   1st Qu.:12.0     
##  Median :14074   Median :2.000                        Median :14.0     
##  Mean   :14326   Mean   :2.728                        Mean   :15.2     
##  3rd Qu.:20456   3rd Qu.:4.000                        3rd Qu.:18.0     
##  Max.   :26997   Max.   :9.000                        Max.   :25.0     
##                                                                        
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.152     Mean   :2.707            Mean   :80    Mean   :0.7839  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##                                                                           
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.05     Mean   :2.832         Mean   :2.782   Mean   : 6.962  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.00       
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.00       
##  Median : 3.000     Median : 1.000          Median : 3.00       
##  Mean   : 4.205     Mean   : 2.169          Mean   : 4.14       
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.00       
##  Max.   :18.000     Max.   :15.000          Max.   :17.00       
## 
str(no_attrition_data)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
##  $ Age                     : int  35 33 26 55 29 51 52 39 31 31 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 2 1 3 3 2 ...
##  $ DailyRate               : int  750 147 1330 1311 1246 1456 585 1387 1062 534 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
##  $ DistanceFromHome        : int  28 2 21 2 19 1 29 10 24 20 ...
##  $ Education               : int  3 3 3 3 3 4 4 5 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 1 4 2 2 4 2 4 4 2 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
##  $ EnvironmentSatisfaction : int  2 2 1 3 3 1 1 2 3 1 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 2 2 1 2 ...
##  $ HourlyRate              : int  46 99 37 97 77 30 40 76 96 66 ...
##  $ JobInvolvement          : int  4 3 3 3 2 2 3 3 2 3 ...
##  $ JobLevel                : int  2 1 1 4 2 3 1 2 2 3 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
##  $ JobSatisfaction         : int  3 3 3 4 3 1 4 1 1 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
##  $ MonthlyIncome           : int  3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
##  $ MonthlyRate             : int  25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
##  $ NumCompaniesWorked      : int  1 1 1 2 1 3 2 2 1 3 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ PercentSalaryHike       : int  17 13 20 13 14 20 15 13 19 12 ...
##  $ PerformanceRating       : int  3 3 4 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  4 4 3 3 3 3 2 4 2 1 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  2 1 1 0 2 0 2 3 0 0 ...
##  $ TotalWorkingYears       : int  10 5 1 30 10 23 16 10 10 12 ...
##  $ TrainingTimesLastYear   : int  3 2 0 2 3 1 3 3 2 2 ...
##  $ WorkLifeBalance         : int  2 3 2 3 3 2 2 3 3 3 ...
##  $ YearsAtCompany          : int  10 5 1 5 10 13 9 7 10 1 ...
##  $ YearsInCurrentRole      : int  9 4 1 4 7 12 8 7 9 0 ...
##  $ YearsSinceLastPromotion : int  6 1 0 1 0 12 0 7 1 0 ...
##  $ YearsWithCurrManager    : int  8 4 0 2 4 8 0 7 8 0 ...
summary(no_attrition_data)
##        ID            Age                  BusinessTravel   DailyRate     
##  Min.   :1171   Min.   :19.00   Non-Travel       : 32    Min.   : 102.0  
##  1st Qu.:1246   1st Qu.:31.00   Travel_Frequently: 57    1st Qu.: 448.0  
##  Median :1320   Median :36.00   Travel_Rarely    :211    Median : 775.0  
##  Mean   :1320   Mean   :37.86                            Mean   : 784.8  
##  3rd Qu.:1395   3rd Qu.:44.00                            3rd Qu.:1117.0  
##  Max.   :1470   Max.   :60.00                            Max.   :1490.0  
##                                                                          
##                   Department  DistanceFromHome   Education    
##  Human Resources       : 11   Min.   : 1.00    Min.   :1.000  
##  Research & Development:209   1st Qu.: 2.00    1st Qu.:2.000  
##  Sales                 : 80   Median : 7.00    Median :3.000  
##                               Mean   : 9.26    Mean   :2.973  
##                               3rd Qu.:14.00    3rd Qu.:4.000  
##                               Max.   :29.00    Max.   :5.000  
##                                                               
##           EducationField EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Human Resources :  7    Min.   :1     Min.   :   2.0   Min.   :1.000          
##  Life Sciences   :130    1st Qu.:1     1st Qu.: 508.8   1st Qu.:2.000          
##  Marketing       : 27    Median :1     Median : 994.5   Median :3.000          
##  Medical         : 94    Mean   :1     Mean   :1020.9   Mean   :2.733          
##  Other           : 12    3rd Qu.:1     3rd Qu.:1542.5   3rd Qu.:4.000          
##  Technical Degree: 30    Max.   :1     Max.   :2065.0   Max.   :4.000          
##                                                                                
##     Gender      HourlyRate     JobInvolvement     JobLevel  
##  Female:105   Min.   : 30.00   Min.   :1.000   Min.   :1.0  
##  Male  :195   1st Qu.: 50.00   1st Qu.:2.000   1st Qu.:1.0  
##               Median : 66.00   Median :3.000   Median :2.0  
##               Mean   : 66.07   Mean   :2.743   Mean   :2.2  
##               3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.0  
##               Max.   :100.00   Max.   :4.000   Max.   :5.0  
##                                                             
##                       JobRole   JobSatisfaction  MaritalStatus MonthlyIncome  
##  Research Scientist       :61   Min.   :1.000   Divorced: 65   Min.   : 1232  
##  Sales Executive          :57   1st Qu.:2.000   Married :128   1st Qu.: 3034  
##  Laboratory Technician    :55   Median :3.000   Single  :107   Median : 5208  
##  Manufacturing Director   :31   Mean   :2.767                  Mean   : 7103  
##  Manager                  :30   3rd Qu.:4.000                  3rd Qu.: 9750  
##  Healthcare Representative:29   Max.   :4.000                  Max.   :19973  
##  (Other)                  :37                                                 
##   MonthlyRate    NumCompaniesWorked Over18  OverTime  PercentSalaryHike
##  Min.   : 2097   Min.   :0.000      Y:300   No :212   Min.   :11.00    
##  1st Qu.: 8420   1st Qu.:1.000              Yes: 88   1st Qu.:12.00    
##  Median :15091   Median :2.000                        Median :14.00    
##  Mean   :14499   Mean   :2.547                        Mean   :15.17    
##  3rd Qu.:20330   3rd Qu.:4.000                        3rd Qu.:18.00    
##  Max.   :26914   Max.   :9.000                        Max.   :25.00    
##                                                                        
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.153     Mean   :2.803            Mean   :80    Mean   :0.7833  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##                                                                           
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :2.000         Median :3.000   Median : 5.000  
##  Mean   :12.44     Mean   :2.683         Mean   :2.747   Mean   : 7.527  
##  3rd Qu.:18.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :38.00     Max.   :6.000         Max.   :4.000   Max.   :37.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.00      Min.   : 0.00           Min.   : 0.00       
##  1st Qu.: 2.00      1st Qu.: 0.00           1st Qu.: 2.00       
##  Median : 3.00      Median : 1.00           Median : 3.00       
##  Mean   : 4.33      Mean   : 2.29           Mean   : 4.38       
##  3rd Qu.: 7.00      3rd Qu.: 3.00           3rd Qu.: 7.00       
##  Max.   :18.00      Max.   :15.00           Max.   :17.00       
## 
summary(no_salary_data)
##        ID              Age        Attrition           BusinessTravel
##  Min.   : 871.0   Min.   :18.00   No :249   Non-Travel       : 24   
##  1st Qu.: 945.8   1st Qu.:29.00   Yes: 51   Travel_Frequently: 62   
##  Median :1020.5   Median :36.00             Travel_Rarely    :214   
##  Mean   :1020.5   Mean   :36.27                                     
##  3rd Qu.:1095.2   3rd Qu.:42.00                                     
##  Max.   :1170.0   Max.   :60.00                                     
##                                                                     
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 105.0   Human Resources       : 17   Min.   : 1.00    Min.   :1.000  
##  1st Qu.: 429.2   Research & Development:190   1st Qu.: 2.00    1st Qu.:2.000  
##  Median : 693.0   Sales                 : 93   Median : 7.00    Median :3.000  
##  Mean   : 783.2                                Mean   : 8.70    Mean   :2.887  
##  3rd Qu.:1171.2                                3rd Qu.:11.25    3rd Qu.:4.000  
##  Max.   :1492.0                                Max.   :29.00    Max.   :5.000  
##                                                                                
##           EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
##  Human Resources :  5    Min.   :1     Min.   :   7   Min.   :1.00           
##  Life Sciences   :118    1st Qu.:1     1st Qu.: 477   1st Qu.:2.00           
##  Marketing       : 32    Median :1     Median :1008   Median :3.00           
##  Medical         :100    Mean   :1     Mean   :1014   Mean   :2.77           
##  Other           : 18    3rd Qu.:1     3rd Qu.:1569   3rd Qu.:4.00           
##  Technical Degree: 27    Max.   :1     Max.   :2068   Max.   :4.00           
##                                                                              
##     Gender      HourlyRate     JobInvolvement     JobLevel
##  Female:129   Min.   : 30.00   Min.   :1.000   Min.   :1  
##  Male  :171   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1  
##               Median : 66.00   Median :3.000   Median :2  
##               Mean   : 66.52   Mean   :2.737   Mean   :2  
##               3rd Qu.: 85.25   3rd Qu.:3.000   3rd Qu.:2  
##               Max.   :100.00   Max.   :4.000   Max.   :5  
##                                                           
##                       JobRole   JobSatisfaction  MaritalStatus  MonthlyRate   
##  Sales Executive          :69   Min.   :1.000   Divorced: 71   Min.   : 2122  
##  Research Scientist       :59   1st Qu.:2.000   Married :135   1st Qu.: 7778  
##  Laboratory Technician    :51   Median :3.000   Single  : 94   Median :13508  
##  Manufacturing Director   :27   Mean   :2.747                  Mean   :14091  
##  Healthcare Representative:26   3rd Qu.:4.000                  3rd Qu.:20464  
##  Manager                  :21   Max.   :4.000                  Max.   :26999  
##  (Other)                  :47                                                 
##  NumCompaniesWorked Over18  OverTime  PercentSalaryHike PerformanceRating
##  Min.   :0.00       Y:300   No :224   Min.   :11.00     Min.   :3.00     
##  1st Qu.:1.00               Yes: 76   1st Qu.:12.75     1st Qu.:3.00     
##  Median :2.00                         Median :14.00     Median :3.00     
##  Mean   :2.74                         Mean   :15.28     Mean   :3.16     
##  3rd Qu.:4.00                         3rd Qu.:18.00     3rd Qu.:3.00     
##  Max.   :9.00                         Max.   :25.00     Max.   :4.00     
##                                                                          
##  RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
##  Min.   :1.000            Min.   :80    Min.   :0.0000   Min.   : 0.00    
##  1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00    
##  Median :3.000            Median :80    Median :1.0000   Median : 9.00    
##  Mean   :2.637            Mean   :80    Mean   :0.8333   Mean   :10.78    
##  3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:14.00    
##  Max.   :4.000            Max.   :80    Max.   :3.0000   Max.   :40.00    
##                                                                           
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.00          Min.   :1.000   Min.   : 0.000   Min.   : 0.0      
##  1st Qu.:2.00          1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.0      
##  Median :3.00          Median :3.000   Median : 5.000   Median : 3.0      
##  Mean   :2.82          Mean   :2.717   Mean   : 6.623   Mean   : 4.2      
##  3rd Qu.:3.00          3rd Qu.:3.000   3rd Qu.: 9.000   3rd Qu.: 7.0      
##  Max.   :6.00          Max.   :4.000   Max.   :33.000   Max.   :16.0      
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.00           Min.   : 0.000      
##  1st Qu.: 0.00           1st Qu.: 2.000      
##  Median : 1.00           Median : 3.000      
##  Mean   : 2.14           Mean   : 3.817      
##  3rd Qu.: 3.00           3rd Qu.: 7.000      
##  Max.   :15.00           Max.   :15.000      
## 
str(no_salary_data)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  871 872 873 874 875 876 877 878 879 880 ...
##  $ Age                     : int  43 33 55 36 27 39 33 21 30 51 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 2 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 2 3 3 1 3 3 1 2 2 3 ...
##  $ DailyRate               : int  1422 461 267 1351 1302 895 750 251 1312 1405 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 3 2 2 3 3 2 2 2 ...
##  $ DistanceFromHome        : int  2 13 13 9 19 5 22 10 23 11 ...
##  $ Education               : int  4 1 4 4 3 3 2 2 3 2 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 3 2 5 6 3 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1849 995 1372 1949 1619 42 160 1279 159 1367 ...
##  $ EnvironmentSatisfaction : int  1 2 1 1 4 4 3 1 1 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 2 2 1 2 1 ...
##  $ HourlyRate              : int  92 53 85 66 67 56 95 45 96 82 ...
##  $ JobInvolvement          : int  3 3 4 4 2 3 3 2 1 2 ...
##  $ JobLevel                : int  2 1 4 1 1 2 2 1 1 4 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 8 3 3 9 8 3 7 5 ...
##  $ JobSatisfaction         : int  4 4 3 2 1 4 2 3 3 2 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 2 3 3 2 1 2 2 3 1 3 ...
##  $ MonthlyRate             : int  19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
##  $ NumCompaniesWorked      : int  1 3 6 1 1 3 0 1 1 3 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
##  $ PercentSalaryHike       : int  20 18 17 22 11 14 13 20 25 16 ...
##  $ PerformanceRating       : int  4 3 3 4 3 3 3 4 4 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 2 1 3 1 3 3 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 0 2 1 1 0 3 0 ...
##  $ TotalWorkingYears       : int  7 5 24 5 7 19 8 2 10 29 ...
##  $ TrainingTimesLastYear   : int  5 4 2 3 3 6 2 2 2 1 ...
##  $ WorkLifeBalance         : int  3 3 2 3 3 4 4 1 2 2 ...
##  $ YearsAtCompany          : int  7 3 19 5 7 1 7 2 10 5 ...
##  $ YearsInCurrentRole      : int  7 2 7 4 7 0 7 2 7 2 ...
##  $ YearsSinceLastPromotion : int  7 0 3 0 0 0 0 2 0 0 ...
##  $ YearsWithCurrManager    : int  7 2 8 2 7 0 7 2 9 3 ...
# Drop columns that doesnt make sense to the analysis.
# Employee count (always 1), Standard hours (always 80) and over 18 (always 18) variables are constant.

drops <- c("EmployeeCount", "EmployeeNumber", "Over18", "StandardHours")
raw_data <- raw_data[ , !(names(raw_data) %in% drops)]

Data Imputation

# From the documentation we have reference values for a columns JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance.
# Lets replace those values with actual values.
# We will also be removing a few columns from our raw data which we wont be using in analysis further.
imp_data <- raw_data %>%
  mutate(JobInvolvement = as.factor(if_else(JobInvolvement == 1,"Low",if_else(JobInvolvement == 2, "Medium",if_else(JobInvolvement == 3, "High", "Very High")))),  JobSatisfaction = as.factor(if_else(JobSatisfaction == 1, "Low",if_else(JobSatisfaction == 2, "Medium",if_else(JobSatisfaction == 3, "High","Very High")))),  PerformanceRating = as.factor(if_else(PerformanceRating == 1, "Low",if_else(PerformanceRating == 2, "Good", if_else(PerformanceRating == 3, "Excellent", "Outstanding")))),RelationshipSatisfaction = as.factor(if_else(RelationshipSatisfaction == 1, "Low",if_else(RelationshipSatisfaction == 2, "Medium", if_else(RelationshipSatisfaction == 3, "High", "Very High")))),WorkLifeBalance = as.factor(if_else(WorkLifeBalance == 1, "Bad",if_else(WorkLifeBalance == 2, "Good", if_else(WorkLifeBalance == 3, "Better", "Best"))))) 

imp_no_attrition <- no_attrition_data %>%
  mutate(JobInvolvement = as.factor(if_else(JobInvolvement == 1,"Low",if_else(JobInvolvement == 2, "Medium",if_else(JobInvolvement == 3, "High", "Very High")))),  JobSatisfaction = as.factor(if_else(JobSatisfaction == 1, "Low",if_else(JobSatisfaction == 2, "Medium",if_else(JobSatisfaction == 3, "High","Very High")))),  PerformanceRating = as.factor(if_else(PerformanceRating == 1, "Low",if_else(PerformanceRating == 2, "Good", if_else(PerformanceRating == 3, "Excellent", "Outstanding")))),RelationshipSatisfaction = as.factor(if_else(RelationshipSatisfaction == 1, "Low",if_else(RelationshipSatisfaction == 2, "Medium", if_else(RelationshipSatisfaction == 3, "High", "Very High")))),WorkLifeBalance = as.factor(if_else(WorkLifeBalance == 1, "Bad",if_else(WorkLifeBalance == 2, "Good", if_else(WorkLifeBalance == 3, "Better", "Best"))))) 

imp_no_salary <- no_salary_data %>%
  mutate(JobInvolvement = as.factor(if_else(JobInvolvement == 1,"Low",if_else(JobInvolvement == 2, "Medium",if_else(JobInvolvement == 3, "High", "Very High")))),  JobSatisfaction = as.factor(if_else(JobSatisfaction == 1, "Low",if_else(JobSatisfaction == 2, "Medium",if_else(JobSatisfaction == 3, "High","Very High")))),  PerformanceRating = as.factor(if_else(PerformanceRating == 1, "Low",if_else(PerformanceRating == 2, "Good", if_else(PerformanceRating == 3, "Excellent", "Outstanding")))),RelationshipSatisfaction = as.factor(if_else(RelationshipSatisfaction == 1, "Low",if_else(RelationshipSatisfaction == 2, "Medium", if_else(RelationshipSatisfaction == 3, "High", "Very High")))),WorkLifeBalance = as.factor(if_else(WorkLifeBalance == 1, "Bad",if_else(WorkLifeBalance == 2, "Good", if_else(WorkLifeBalance == 3, "Better", "Best"))))) 

classification_data  <- imp_data

summary(imp_data)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   No :730   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   Yes:140   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EnvironmentSatisfaction    Gender      HourlyRate    
##  Human Resources : 15    Min.   :1.000           Female:354   Min.   : 30.00  
##  Life Sciences   :358    1st Qu.:2.000           Male  :516   1st Qu.: 48.00  
##  Marketing       :100    Median :3.000                        Median : 66.00  
##  Medical         :270    Mean   :2.701                        Mean   : 65.61  
##  Other           : 52    3rd Qu.:4.000                        3rd Qu.: 83.00  
##  Technical Degree: 75    Max.   :4.000                        Max.   :100.00  
##                                                                               
##    JobInvolvement    JobLevel                          JobRole   
##  High     :514    Min.   :1.000   Sales Executive          :200  
##  Low      : 47    1st Qu.:1.000   Research Scientist       :172  
##  Medium   :228    Median :2.000   Laboratory Technician    :153  
##  Very High: 81    Mean   :2.039   Manufacturing Director   : 87  
##                   3rd Qu.:3.000   Healthcare Representative: 76  
##                   Max.   :5.000   Sales Representative     : 53  
##                                   (Other)                  :129  
##   JobSatisfaction  MaritalStatus MonthlyIncome    MonthlyRate   
##  High     :254    Divorced:191   Min.   : 1081   Min.   : 2094  
##  Low      :179    Married :410   1st Qu.: 2840   1st Qu.: 8092  
##  Medium   :166    Single  :269   Median : 4946   Median :14074  
##  Very High:271                   Mean   : 6390   Mean   :14326  
##                                  3rd Qu.: 8182   3rd Qu.:20456  
##                                  Max.   :19999   Max.   :26997  
##                                                                 
##  NumCompaniesWorked OverTime  PercentSalaryHike   PerformanceRating
##  Min.   :0.000      No :618   Min.   :11.0      Excellent  :738    
##  1st Qu.:1.000      Yes:252   1st Qu.:12.0      Outstanding:132    
##  Median :2.000                Median :14.0                         
##  Mean   :2.728                Mean   :15.2                         
##  3rd Qu.:4.000                3rd Qu.:18.0                         
##  Max.   :9.000                Max.   :25.0                         
##                                                                    
##  RelationshipSatisfaction StockOptionLevel TotalWorkingYears
##  High     :261            Min.   :0.0000   Min.   : 0.00    
##  Low      :174            1st Qu.:0.0000   1st Qu.: 6.00    
##  Medium   :171            Median :1.0000   Median :10.00    
##  Very High:264            Mean   :0.7839   Mean   :11.05    
##                           3rd Qu.:1.0000   3rd Qu.:15.00    
##                           Max.   :3.0000   Max.   :40.00    
##                                                             
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         Bad   : 48      Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         Best  : 98      1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         Better:532      Median : 5.000   Median : 3.000    
##  Mean   :2.832         Good  :192      Mean   : 6.962   Mean   : 4.205    
##  3rd Qu.:3.000                         3rd Qu.:10.000   3rd Qu.: 7.000    
##  Max.   :6.000                         Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.00       
##  1st Qu.: 0.000          1st Qu.: 2.00       
##  Median : 1.000          Median : 3.00       
##  Mean   : 2.169          Mean   : 4.14       
##  3rd Qu.: 3.000          3rd Qu.: 7.00       
##  Max.   :15.000          Max.   :17.00       
## 
# Separate continuous and categorical variables for analysis
imp_data_conti <- imp_data[, !sapply(imp_data, is.factor)]
imp_data_categ <- imp_data[, sapply(imp_data, is.factor)]

EDA - Correlation in Predictors

# Computing the p value of correlations
cor.mtest <- function(mat, ...) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(mat[, i], mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

p.mat <- cor.mtest(imp_data_conti)
correlation <- cor(imp_data_conti)
# Correlation plot with significance level of 0.05
corrplot::corrplot(correlation, type="upper", order="hclust", title ="correlation", p.mat = p.mat, sig.level = 0.05)

cat_cor<- GKtauDataframe(imp_data_categ)
plot(cat_cor, corrColors = "blue")

EDA - Correlation Analysis. Below variables are correlated to each other 1) YearsSinceLastPromotion -> YearsWithCurrManager, YearsInCurrentRole, YearsAtCompany, TotalWorkingYears 2) YearsAtCompany -> JobLevel, MonthlyIncome 3) Age -> TotalWorkingYears, JobLevel, MonthlyIncome 4) TotalWorkingYears -> JobLevel, MonthlyIncome 5) Department -> JobRole

EDA - Histogram, Scatterplot, Boxplots

# Histogram of each variable
# Scatter Plot
pairs(imp_data_conti, pch=19)

# Box Plot
boxplot(imp_data_conti)

boxplot(imp_data_conti$MonthlyIncome, main="Boxplot Monthly Income")

EDA - Observations - Individual Features

# Employee Personal Demographics - Numerical Variables
p1 <- ggplot(imp_data) + geom_histogram(aes(Age), binwidth = 5, fill = "light green",col = "black")
p2 <- ggplot(imp_data) + geom_histogram(aes(DistanceFromHome), binwidth = 5, fill = "light green",col = "black")
p3 <- ggplot(imp_data) + geom_histogram(aes(NumCompaniesWorked), binwidth = 2, fill = "light green",col = "black")
p4 <- ggplot(imp_data) + geom_histogram(aes(TotalWorkingYears), binwidth = 4, fill = "light green",col = "black")

grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

# Employee Billing Rate Demographics - Numerical Variables
p1 <- ggplot(imp_data) + geom_histogram(aes(HourlyRate), binwidth = 5, fill = "light green",col = "black")
p2 <- ggplot(imp_data) + geom_histogram(aes(DailyRate), binwidth = 100, fill = "light green",col = "black")
p3 <- ggplot(imp_data) + geom_histogram(aes(MonthlyRate), binwidth = 1000, fill = "light green",col = "black")

grid.arrange(p1, p2, p3, nrow = 3)

#Observations :
#There seems to be no clear cut pattern observed in these rates.

#Employee Work Demographics - Numerical Variables
p1 <- ggplot(imp_data) + geom_histogram(aes(Age), binwidth = 5, fill = "light green",col = "black")
p2 <- ggplot(imp_data) + geom_histogram(aes(PercentSalaryHike), binwidth = 1, fill = "light green",col = "black")
p3 <- ggplot(imp_data) + geom_histogram(aes(YearsAtCompany), binwidth = 2, fill = "light green",col = "black")
p4 <- ggplot(imp_data) + geom_histogram(aes(YearsInCurrentRole), binwidth = 2, fill = "light green",col = "black")
p5 <- ggplot(imp_data) + geom_histogram(aes(YearsSinceLastPromotion), binwidth = 2, fill = "light green",col = "black")
p6 <- ggplot(imp_data) + geom_histogram(aes(YearsWithCurrManager), binwidth = 2, fill = "light green",col = "black")
p7 <- ggplot(imp_data) + geom_histogram(aes(MonthlyIncome), binwidth = 1000, fill = "light green",col = "black")
p8 <- ggplot(imp_data) + geom_histogram(aes(DistanceFromHome), binwidth = 5, fill = "light green",col = "black")
p9 <- ggplot(imp_data) + geom_histogram(aes(NumCompaniesWorked), binwidth = 2, fill = "light green",col = "black")

grid.arrange(p1, p2, p3, p4, p5, p6,p7,p8,p9 ,nrow = 3, ncol = 3)

#Observations:
#All 6 variables are right skewed 

#Employee Personal Demographics - Categorical Variables
p1<- imp_data %>%
  dplyr::group_by(Gender) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(Gender), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Gender") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 900))

p2<- imp_data %>%
  dplyr::group_by(Education) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(Education), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Education") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 650))

p3 <- imp_data %>%
  dplyr::group_by(EducationField) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(EducationField), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Education Field") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 650))

p4 <- imp_data %>%
  dplyr::group_by(MaritalStatus) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(MaritalStatus), y = counts)) + geom_bar(stat = 'identity', fill = "light green")+ ggtitle("Marital Status") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 750))

p5 <- imp_data %>%
  dplyr::group_by(RelationshipSatisfaction) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(RelationshipSatisfaction), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Relationship Satisfaction") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())+ scale_y_continuous(limits = c(0, 500))

p6 <- imp_data %>%
  dplyr::group_by(WorkLifeBalance) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(WorkLifeBalance), y = counts)) + geom_bar(stat = 'identity', fill = "light green")+ ggtitle("Work Life Balance") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 950))

grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)

#Employee Work Demographics - Categorical Variables
p1 <- imp_data %>%
  dplyr::group_by(BusinessTravel) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(BusinessTravel), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Business Travel") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank())+ scale_y_continuous(limits = c(0, 1100))



p2 <- imp_data %>%
  dplyr::group_by(EnvironmentSatisfaction) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(EnvironmentSatisfaction), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Environment Satisfaction") + geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 500))

p3 <- imp_data %>%
  dplyr::group_by(JobInvolvement) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(JobInvolvement), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Involvement") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 900))


p4 <- imp_data %>%
  dplyr::group_by(JobSatisfaction) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(JobSatisfaction), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Satisfaction") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 500))

p5 <- imp_data %>%
  dplyr::group_by(OverTime) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(OverTime), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Over Time") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 1100))


p6 <- imp_data %>%
  dplyr::group_by(PerformanceRating) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(PerformanceRating), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Performance Rating") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 1300))

grid.arrange(p1,p2,p3,p4,p5,p6,nrow = 2)

p1 <- imp_data %>%
  dplyr::group_by(Department) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(Department), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Department") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size = 7, angle = 45, hjust = 1),axis.title.x=element_blank())

p2 <- imp_data %>%
  dplyr::group_by(JobRole) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(JobRole), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Role") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

grid.arrange(p1,p2 ,ncol = 2)

p1 <- imp_data %>%
  dplyr::group_by(WorkLifeBalance) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(WorkLifeBalance), y = counts)) + geom_bar(stat = 'identity', fill = "light green")+ ggtitle("Work Life Balance") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
  
p2 <- imp_data %>%
  dplyr::group_by(BusinessTravel) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(BusinessTravel), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Business Travel") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank())+ scale_y_continuous(limits = c(0, 800))


p3 <- imp_data %>%
  dplyr::group_by(JobInvolvement) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(JobInvolvement), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Involvement") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
  

p4 <- imp_data %>%
  dplyr::group_by(OverTime) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(OverTime), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Over Time") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))


p5 <- imp_data %>%
  dplyr::group_by(PerformanceRating) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(PerformanceRating), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Performance Rating") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))


p6 <- imp_data %>%
  dplyr::group_by(Department) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x = as.factor(Department), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Department") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size = 7, angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))

  grid.arrange(p1,p2,p3,p4,p5,p6,nrow = 2)

EDA - Observations - Univariate Analysis 1) MonthlyIncome has outliers. 2) Right Skewed data - JobLevel, MonthlyIncome, PercentageSalaryHike, TotalWorkingYears, YearsAtCompany, YearsSinceLastPromotion 3) More than 50% employees are male. 4) Almost 70% of the employees are from Life Science and Medical Background. 5) 47% of the employees are married where as 22% are divorced. 6) More than 60% of the employees feel they have a better work life balance. 7) More than 70% of the employees Travel rarely for work. 8) More than 70% of the people seem to be working over time

EDA - Observations - Multivariate Analysis

#After looking at every feature individually, let's now do some bivariate/multivariate analysis. Here we'll explore the independent variables with respect to the target variable. The objective is to discover hidden relationships between the independent variables and the target variable.
#Employee Personal Demographics - Numerical Variables
p1 <- imp_data %>%
  ggplot(aes(x = Age, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Age") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

p2 <- imp_data %>%
  ggplot(aes(x = DistanceFromHome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Distance From Home")  + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

p3 <- imp_data %>%
  ggplot(aes(x = NumCompaniesWorked, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Number of Companies")  + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

p4 <- imp_data %>%
  ggplot(aes(x = TotalWorkingYears, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Total Working Years")  + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

#Employee Billing Rate Demographics - Numerical Variables
p1 <- imp_data %>%
  ggplot(aes(x = HourlyRate, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Hourly Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p2 <- imp_data %>%
  ggplot(aes(x = DailyRate, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Daily Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p3 <- imp_data %>%
  ggplot(aes(x = MonthlyRate, fill = Attrition)) + geom_density(alpha = 0.5)+ ggtitle("Monthly Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


grid.arrange(p1, p2, p3)

# Employee Work Demographics - Numerical Variables
p1 <- imp_data %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Monthly Income") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p2 <- imp_data %>%
  ggplot(aes(x = PercentSalaryHike, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Percentage Salary Hike") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p3 <- imp_data %>%
  ggplot(aes(x = YearsAtCompany, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years At Company") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p4 <- imp_data %>%
  ggplot(aes(x = YearsInCurrentRole, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years in Current Role") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p5 <- imp_data %>%
  ggplot(aes(x = YearsSinceLastPromotion, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years Since Last Promotion") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p6 <- imp_data %>%
  ggplot(aes(x = YearsWithCurrManager, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years With Current Manager") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


grid.arrange(p1, p2, p3, p4, p5, p6 , nrow = 3, ncol = 2)

p1 <- imp_data %>%
  ggplot(aes(x = Age, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Age") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

p2 <- imp_data %>%
  ggplot(aes(x = DistanceFromHome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Distance From Home")  + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())


p3 <- imp_data %>%
  ggplot(aes(x = TotalWorkingYears, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Total Working Years")  + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())

p4 <- imp_data %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Monthly Income") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
  
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

#Employee Personal Demographics - Categorical Variables
p1 <- imp_data %>%
  dplyr::group_by(Gender) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = Gender, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Gender") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 20))


p2 <- imp_data %>%
  dplyr::group_by(Education) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = Education, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Education") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 20))

p3 <- imp_data %>%
  dplyr::group_by(EducationField) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = EducationField, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Education Field") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))

p4 <- imp_data %>%
  dplyr::group_by(MaritalStatus) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = MaritalStatus, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Marital Status") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 40))

p5 <- imp_data %>%
  dplyr::group_by(RelationshipSatisfaction) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = as.factor(RelationshipSatisfaction), y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Relationship Satisfaction") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 40))

p6 <- imp_data %>%
  dplyr::group_by(WorkLifeBalance) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = as.factor(WorkLifeBalance), y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Work Life Balance") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())  + geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.15)+ scale_y_continuous(limits = c(0, 40))


grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)

#Employee Work Demographics - Categorical Variables
p1 <- imp_data %>%
  dplyr::group_by(BusinessTravel) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = BusinessTravel, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Business Travel") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))

p2 <- imp_data %>%
  dplyr::group_by(EnvironmentSatisfaction) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = EnvironmentSatisfaction, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Environment Satisfaction") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))

p3 <- imp_data %>%
  dplyr::group_by(JobInvolvement) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = JobInvolvement, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Job Involvement") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 40))

p4 <- imp_data %>%
  dplyr::group_by(JobSatisfaction) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = JobSatisfaction, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Job Satisfaction") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))

p5 <- imp_data %>%
  dplyr::group_by(OverTime) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = OverTime, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Over Time") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 35))

p6 <- imp_data %>%
  dplyr::group_by(PerformanceRating) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = as.factor(PerformanceRating), y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Performance Rating") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 20))

grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

p1 <- imp_data %>%
  dplyr::group_by(Department) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = Department, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Department") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)

p2 <- imp_data %>%
  dplyr::group_by(JobRole) %>%
  dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
  ggplot(aes(x = JobRole, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "Orange") + ggtitle("Job Role - Attrition Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)

p2

grid.arrange(p1, p2, ncol = 2)

EDA - Observations - Bi/Multivariate Analysis 1) Younger employees within 25-35 years have a higher attrition rate. 2) We see a lower attrition rate when the Distance from home is within 10 units. The attrition rate increase post 10 units. 3) The attrition rate tends to be higher with employees who have worked with 5 to 7 companies. 4) We observe a peak in attrition rate at a monthly income approx. 2500. 5) We also see a peak in attrition rate when the employee is with the company for 0-2 years approx. 6) #Attrition rate is very high among employees from HR, Marketing and Technical backgrounds. 7) As expected, the attrition rate is very high among employees who have a bad work life balance. 8) Attrition rate is higher among people who travel frequently. 9) Its also higher among employees who have a low environment satisfaction, low job involvement and low job satisfaction. 10) The attrition rate is almost 30% among employees who work over time. 11) Sales department have the highest attrition at 21% whereas Sales Representatives have the highest attrition at 45%.

Data splitting - Classification

set.seed(1234)
splitPerc = .70
trainIndices = sample(1:dim(classification_data)[1],round(splitPerc * dim(classification_data)[1]))
train = classification_data[trainIndices,]
test = classification_data[-trainIndices,]

# 50-50 SMOTE down sampling
train_down_sampled <- SMOTE(form = Attrition~.,data = train, k = 10, perc.over = 100)


# ROSE Down sampling
train_bal_r <- ROSE(Attrition ~ ., data = train)$data


train_sampled <- train_down_sampled

Basic Naive bayes model

#Split train and test model
set.seed(1234)
model.naive <- naiveBayes(Attrition ~ ., data=train_sampled[,-c(1)])
attrition.pred.naive <- predict(model.naive, test[,-c(1)], type="class")
confusionMatrix(attrition.pred.naive, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  130  11
##        Yes  83  37
##                                           
##                Accuracy : 0.6398          
##                  95% CI : (0.5784, 0.6981)
##     No Information Rate : 0.8161          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2411          
##                                           
##  Mcnemar's Test P-Value : 2.423e-13       
##                                           
##             Sensitivity : 0.6103          
##             Specificity : 0.7708          
##          Pos Pred Value : 0.9220          
##          Neg Pred Value : 0.3083          
##              Prevalence : 0.8161          
##          Detection Rate : 0.4981          
##    Detection Prevalence : 0.5402          
##       Balanced Accuracy : 0.6906          
##                                           
##        'Positive' Class : No              
## 

Train model with Naive bayes method and get importance of predictors

# Prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs = TRUE)
set.seed(1234)
# Train the model
model.naive.t <- train( Attrition ~ ., data=train_sampled[, -c(1)], method="naive_bayes", trControl=control)
attrition.pred.naive.t <- predict(model.naive.t, test[-c(1)])
confusionMatrix(attrition.pred.naive.t, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  124  13
##        Yes  89  35
##                                           
##                Accuracy : 0.6092          
##                  95% CI : (0.5471, 0.6688)
##     No Information Rate : 0.8161          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.193           
##                                           
##  Mcnemar's Test P-Value : 1.118e-13       
##                                           
##             Sensitivity : 0.5822          
##             Specificity : 0.7292          
##          Pos Pred Value : 0.9051          
##          Neg Pred Value : 0.2823          
##              Prevalence : 0.8161          
##          Detection Rate : 0.4751          
##    Detection Prevalence : 0.5249          
##       Balanced Accuracy : 0.6557          
##                                           
##        'Positive' Class : No              
## 
# Top 10 Glmnet predictor ranking
importance.naive.t <- varImp(model.naive.t, scale=FALSE)
rank.naive.t <- importance.naive.t$importance
write.csv(rank.naive.t, "rank.naive.t.csv")
rank.naive.t <- read.csv("rank.naive.t.csv", header=TRUE)
colnames(rank.naive.t) <- c("Predictors", "Importance")
rank.naive.t <- rank.naive.t[order(rank.naive.t$Importance, decreasing = TRUE),]
ggplot(rank.naive.t[1:20,], aes(x=reorder(Predictors, Importance),y=Importance)) + geom_bar(stat = "identity") + coord_flip() + labs(title="Importance of Predictors", x="Predictors", y="Importance") +theme(axis.text.x=element_text(hjust=0.5, vjust=0.5, size = 12))+theme(axis.text.y=element_text(size = 12))

Custom Naive Bayes model1

set.seed(1234)
nb_model1 <- Attrition ~ OverTime + MaritalStatus + StockOptionLevel + +MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + YearsInCurrentRole
model.naive.ct <- naiveBayes(nb_model1, data=train_sampled)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
confusionMatrix(attrition.pred.naive.ct, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  115  11
##        Yes  98  37
##                                         
##                Accuracy : 0.5824        
##                  95% CI : (0.52, 0.6429)
##     No Information Rate : 0.8161        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.1826        
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.5399        
##             Specificity : 0.7708        
##          Pos Pred Value : 0.9127        
##          Neg Pred Value : 0.2741        
##              Prevalence : 0.8161        
##          Detection Rate : 0.4406        
##    Detection Prevalence : 0.4828        
##       Balanced Accuracy : 0.6554        
##                                         
##        'Positive' Class : No            
## 

Custom Naive Bayes model2

set.seed(1234)

nb_model2 <- Attrition ~ OverTime + MaritalStatus + MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + JobSatisfaction  + DistanceFromHome + NumCompaniesWorked + WorkLifeBalance + JobRole + StockOptionLevel 
model.naive.ct <- naiveBayes(nb_model2, data=train_sampled)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
confusionMatrix(attrition.pred.naive.ct, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  129  10
##        Yes  84  38
##                                           
##                Accuracy : 0.6398          
##                  95% CI : (0.5784, 0.6981)
##     No Information Rate : 0.8161          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2488          
##                                           
##  Mcnemar's Test P-Value : 5.098e-14       
##                                           
##             Sensitivity : 0.6056          
##             Specificity : 0.7917          
##          Pos Pred Value : 0.9281          
##          Neg Pred Value : 0.3115          
##              Prevalence : 0.8161          
##          Detection Rate : 0.4943          
##    Detection Prevalence : 0.5326          
##       Balanced Accuracy : 0.6987          
##                                           
##        'Positive' Class : No              
## 

Custom Naive Bayes model3

set.seed(1234)
 Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel  + JobSatisfaction + DistanceFromHome + YearsWithCurrManager
## Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + 
##     JobSatisfaction + DistanceFromHome + YearsWithCurrManager
nb_model3 <- Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel  + JobSatisfaction + DistanceFromHome + YearsWithCurrManager 

model.naive.ct <- naiveBayes(nb_model3, data=train_sampled)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
confusionMatrix(attrition.pred.naive.ct, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  146  14
##        Yes  67  34
##                                           
##                Accuracy : 0.6897          
##                  95% CI : (0.6297, 0.7453)
##     No Information Rate : 0.8161          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2758          
##                                           
##  Mcnemar's Test P-Value : 7.569e-09       
##                                           
##             Sensitivity : 0.6854          
##             Specificity : 0.7083          
##          Pos Pred Value : 0.9125          
##          Neg Pred Value : 0.3366          
##              Prevalence : 0.8161          
##          Detection Rate : 0.5594          
##    Detection Prevalence : 0.6130          
##       Balanced Accuracy : 0.6969          
##                                           
##        'Positive' Class : No              
## 

From the above Naive Bayes Models, Model 3 with predictors (OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + JobSatisfaction + DistanceFromHome + YearsWithCurrManager) gave better sensitiviy(69%) and specificity(72%)

Train model with knn and get importance of predictors

#Prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
set.seed(1234)
#Train the model
model.knn <- train( Attrition ~ ., data=train_sampled, method="knn", trControl=control)
attrition.pred.knn <- predict(model.knn, test)
confusionMatrix(attrition.pred.knn, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  124  21
##        Yes  89  27
##                                           
##                Accuracy : 0.5785          
##                  95% CI : (0.5161, 0.6392)
##     No Information Rate : 0.8161          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0934          
##                                           
##  Mcnemar's Test P-Value : 1.679e-10       
##                                           
##             Sensitivity : 0.5822          
##             Specificity : 0.5625          
##          Pos Pred Value : 0.8552          
##          Neg Pred Value : 0.2328          
##              Prevalence : 0.8161          
##          Detection Rate : 0.4751          
##    Detection Prevalence : 0.5556          
##       Balanced Accuracy : 0.5723          
##                                           
##        'Positive' Class : No              
## 
#Top 10 predictor ranking
importance.knn <- varImp(model.knn, scale=FALSE)
rank.knn <- importance.knn$importance
write.csv(rank.knn, "rank.knn.csv")
rank.knn <- read.csv("rank.knn.csv", header=TRUE)
colnames(rank.knn) <- c("Predictors", "Importance")
rank.knn <- rank.knn[order(rank.knn$Importance, decreasing = TRUE),]
ggplot(rank.knn[1:20,], aes(x=reorder(Predictors, Importance),y=Importance)) + geom_bar(stat = "identity") + coord_flip() + labs(title="Importance of Predictors", x="Predictors", y="Importance") +theme(axis.text.x=element_text(hjust=0.5, vjust=0.5, size = 12))+theme(axis.text.y=element_text(size = 12))

Custom knn model1

#Create training and test data
set.seed(1234)
knn_model1 <- Attrition ~ OverTime + MaritalStatus + StockOptionLevel + +MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + YearsInCurrentRole

# k-Nearest Neighbours
fit_knn <- train(knn_model1,train_sampled,method = 'knn',trControl = trainControl(method = 'repeatedcv',number = 3))
fit_knn
## k-Nearest Neighbors 
## 
## 368 samples
##   8 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times) 
## Summary of sample sizes: 246, 246, 244 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.5572008  0.1144016
##   7  0.5653534  0.1307069
##   9  0.5601534  0.1203067
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
Predictions_knn <- predict(fit_knn,test)

#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number =  25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

fit.knn <- train(knn_model1, data = train_sampled, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
pred.knn <- predict(fit.knn, test)
summary(pred.knn)
##  No Yes 
## 163  98
confusionMatrix(pred.knn, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  145  18
##        Yes  68  30
##                                           
##                Accuracy : 0.6705          
##                  95% CI : (0.6099, 0.7272)
##     No Information Rate : 0.8161          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2179          
##                                           
##  Mcnemar's Test P-Value : 1.265e-07       
##                                           
##             Sensitivity : 0.6808          
##             Specificity : 0.6250          
##          Pos Pred Value : 0.8896          
##          Neg Pred Value : 0.3061          
##              Prevalence : 0.8161          
##          Detection Rate : 0.5556          
##    Detection Prevalence : 0.6245          
##       Balanced Accuracy : 0.6529          
##                                           
##        'Positive' Class : No              
## 

Custom knn model2

#Create training and test data
set.seed(1234)
knn_model2 <- Attrition ~ OverTime + MaritalStatus + MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + JobSatisfaction  + DistanceFromHome + NumCompaniesWorked + WorkLifeBalance + JobRole + StockOptionLevel 

# k-Nearest Neighbours
fit_knn <- train(knn_model2,train_sampled,method = 'knn',trControl = trainControl(method = 'repeatedcv',number = 3))
fit_knn
## k-Nearest Neighbors 
## 
## 368 samples
##  12 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times) 
## Summary of sample sizes: 246, 246, 244 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.5627093  0.1254186
##   7  0.5653534  0.1307069
##   9  0.5601534  0.1203067
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
Predictions_knn <- predict(fit_knn,test)

#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number =  25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

fit.knn <- train(knn_model2, data = train_sampled, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
pred.knn <- predict(fit.knn, test)
summary(pred.knn)
##  No Yes 
## 178  83
confusionMatrix(pred.knn, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  160  18
##        Yes  53  30
##                                          
##                Accuracy : 0.728          
##                  95% CI : (0.6697, 0.781)
##     No Information Rate : 0.8161         
##     P-Value [Acc > NIR] : 0.9998         
##                                          
##                   Kappa : 0.2933         
##                                          
##  Mcnemar's Test P-Value : 5.459e-05      
##                                          
##             Sensitivity : 0.7512         
##             Specificity : 0.6250         
##          Pos Pred Value : 0.8989         
##          Neg Pred Value : 0.3614         
##              Prevalence : 0.8161         
##          Detection Rate : 0.6130         
##    Detection Prevalence : 0.6820         
##       Balanced Accuracy : 0.6881         
##                                          
##        'Positive' Class : No             
## 

Custom knn model3

#Create training and test data
set.seed(1234)
knn_model3 <- Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel  + JobSatisfaction + DistanceFromHome + YearsWithCurrManager 

# k-Nearest Neighbours
fit_knn <- train(knn_model3,train_sampled,method = 'knn',trControl = trainControl(method = 'repeatedcv',number = 3))
fit_knn
## k-Nearest Neighbors 
## 
## 368 samples
##   7 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times) 
## Summary of sample sizes: 246, 246, 244 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.5653975  0.1307950
##   7  0.5653534  0.1307069
##   9  0.5601534  0.1203067
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
Predictions_knn <- predict(fit_knn,test)

#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number =  25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

fit.knn <- train(knn_model3, data = train_sampled, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
pred.knn <- predict(fit.knn, test)
summary(pred.knn)
##  No Yes 
## 180  81
confusionMatrix(pred.knn, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  161  19
##        Yes  52  29
##                                          
##                Accuracy : 0.728          
##                  95% CI : (0.6697, 0.781)
##     No Information Rate : 0.8161         
##     P-Value [Acc > NIR] : 0.999825       
##                                          
##                   Kappa : 0.2843         
##                                          
##  Mcnemar's Test P-Value : 0.000146       
##                                          
##             Sensitivity : 0.7559         
##             Specificity : 0.6042         
##          Pos Pred Value : 0.8944         
##          Neg Pred Value : 0.3580         
##              Prevalence : 0.8161         
##          Detection Rate : 0.6169         
##    Detection Prevalence : 0.6897         
##       Balanced Accuracy : 0.6800         
##                                          
##        'Positive' Class : No             
## 

From the above KNN, Model 1 with predictors (OverTime + MaritalStatus + StockOptionLevel + +MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + YearsInCurrentRole) gave better sensitivity(67%) and specificity(64%).

But the Naive Bayes Model3 has better sensitivity(69%) and specificity(72%). So for further predictions we will use Naive Bayes Model 3.

Naive Bayes Model for Predicting Attrition

set.seed(1234)

nb_model3 <- Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel  + JobSatisfaction + DistanceFromHome + YearsWithCurrManager 
test <- no_attrition_data
model.naive.ct <- naiveBayes(nb_model3, data=raw_data)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
attrition_prediction <- data.frame(ID=test$ID,Attrition=attrition.pred.naive.ct)
path_out = '/Users/lijjumathew/Code/Github/MSDS-Doing-Datascience-Project2/Results/'
fileName = paste(path_out, 'Case2PredictionsMathewAttrition..csv',sep = '')
#write.csv(attrition_prediction, fileName, row.names = F)

Linear Regression data prep

#Create training and test data
set.seed(1234)
# Converting Character Variable into Numeric Variables

conv.attr.reg <- function(data, ...) {
data$AttritionNum[data$Attrition== "Yes"] <-1
data$AttritionNum[data$Attrition== "No"] <-0

data$BusinessTravelNum[data$BusinessTravel== "Non-Travel"] <-1
data$BusinessTravelNum[data$BusinessTravel== "Travel_Rarely"] <-2
data$BusinessTravelNum[data$BusinessTravel== "Travel_Frequently"] <-3

data$DepartmentNum[data$Department== "Human Resources"] <-1
data$DepartmentNum[data$Department== "Research & Development"] <-2
data$DepartmentNum[data$Department== "Sales"] <-3

data$EducationFieldNum[data$EducationField== "Human Resources"] <-1
data$EducationFieldNum[data$EducationField== "Life Sciences"] <-2
data$EducationFieldNum[data$EducationField== "Marketing"] <-3
data$EducationFieldNum[data$EducationField== "Medical"] <-4
data$EducationFieldNum[data$EducationField== "Other"] <-5
data$EducationFieldNum[data$EducationField== "Technical Degree"] <-6

data$GenderNum[data$Gender== "Female"] <-1
data$GenderNum[data$Gender== "Male"] <-2

data$JobRoleNum[data$JobRole== "Human Resources"] <-1
data$JobRoleNum[data$JobRole== "Healthcare Representative"] <-2
data$JobRoleNum[data$JobRole== "Laboratory Technician"] <-3
data$JobRoleNum[data$JobRole== "Manager"] <-4
data$JobRoleNum[data$JobRole== "Manufacturing Director"] <-5
data$JobRoleNum[data$JobRole== "Research Director"] <-6
data$JobRoleNum[data$JobRole== "Research Scientist"] <-7
data$JobRoleNum[data$JobRole== "Sales Executive"] <-8
data$JobRoleNum[data$JobRole== "Sales Representative"] <-9

data$MaritalStatusNum[data$MaritalStatus== "Divorced"] <-1
data$MaritalStatusNum[data$MaritalStatus== "Married"] <-2
data$MaritalStatusNum[data$MaritalStatus== "Single"] <-3

data$OverTimeNum[data$OverTime== "Yes"] <-1
data$OverTimeNum[data$OverTime== "No"] <-0

data$JobInvolvementNum[data$JobInvolvement== "Low"] <-1
data$JobInvolvementNum[data$JobInvolvement== "Medium"] <-2
data$JobInvolvementNum[data$JobInvolvement== "High"] <-3
data$JobInvolvementNum[data$JobInvolvement== "Very High"] <-4

data$JobSatisfactionNum[data$JobSatisfaction== "Low"] <-1
data$JobSatisfactionNum[data$JobSatisfaction== "Medium"] <-2
data$JobSatisfactionNum[data$JobSatisfaction== "High"] <-3
data$JobSatisfactionNum[data$JobSatisfaction== "Very High"] <-4

data$PerformanceRatingNum[data$PerformanceRating== "Low"] <-1
data$PerformanceRatingNum[data$PerformanceRating== "Good"] <-2
data$PerformanceRatingNum[data$PerformanceRating== "Excellent"] <-3
data$PerformanceRatingNum[data$PerformanceRating== "Outstanding"] <-4

data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "Low"] <-1
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "Medium"] <-2
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "High"] <-3
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "Very High"] <-4

data$WorkLifeBalanceNum[data$WorkLifeBalance== "Bad"] <-1
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Good"] <-2
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Better"] <-3
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Best"] <-4

drops <- c("Attrition", "BusinessTravel", "Department","EducationField","Gender","JobRole","MaritalStatus","OverTime","JobInvolvement", "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction","WorkLifeBalance")
data <- data[ , !(names(data) %in% drops)]

data
}

Identifying Top 3 factors - Correlation Test

corr_data <- conv.attr.reg(classification_data)

cor.test(corr_data[,"AttritionNum"], corr_data[,"BusinessTravelNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "BusinessTravelNum"]
## t = 2.376, df = 868, p-value = 0.01772
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01399612 0.14607234
## sample estimates:
##        cor 
## 0.08038707
cor.test(corr_data[,"AttritionNum"], corr_data[,"DailyRate"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "DailyRate"]
## t = -0.99618, df = 868, p-value = 0.3194
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10003417  0.03274616
## sample estimates:
##         cor 
## -0.03379312
cor.test(corr_data[,"AttritionNum"], corr_data[,"DistanceFromHome"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "DistanceFromHome"]
## t = 2.577, df = 868, p-value = 0.01013
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02079097 0.15271755
## sample estimates:
##        cor 
## 0.08713629
cor.test(corr_data[,"AttritionNum"], corr_data[,"EducationFieldNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "EducationFieldNum"]
## t = 0.77046, df = 868, p-value = 0.4412
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04039377  0.09244725
## sample estimates:
##        cor 
## 0.02614215
cor.test(corr_data[,"AttritionNum"], corr_data[,"GenderNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "GenderNum"]
## t = 0.74416, df = 868, p-value = 0.457
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04128468  0.09156240
## sample estimates:
##        cor 
## 0.02525034
cor.test(corr_data[,"AttritionNum"], corr_data[,"HourlyRate"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "HourlyRate"]
## t = 1.0777, df = 868, p-value = 0.2815
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0299844  0.1027702
## sample estimates:
##        cor 
## 0.03655418
cor.test(corr_data[,"AttritionNum"], corr_data[,"MaritalStatusNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "MaritalStatusNum"]
## t = 5.9205, df = 868, p-value = 4.621e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1322814 0.2600751
## sample estimates:
##      cor 
## 0.197015
cor.test(corr_data[,"AttritionNum"], corr_data[,"MonthlyIncome"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "MonthlyIncome"]
## t = -4.6199, df = 868, p-value = 4.422e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.21912446 -0.08936942
## sample estimates:
##       cor 
## -0.154915
cor.test(corr_data[,"AttritionNum"], corr_data[,"MonthlyRate"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "MonthlyRate"]
## t = -1.2749, df = 868, p-value = 0.2027
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10938360  0.02330051
## sample estimates:
##         cor 
## -0.04323217
cor.test(corr_data[,"AttritionNum"], corr_data[,"NumCompaniesWorked"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "NumCompaniesWorked"]
## t = 1.8011, df = 868, p-value = 0.07204
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.005469025  0.126969672
## sample estimates:
##        cor 
## 0.06101889
cor.test(corr_data[,"AttritionNum"], corr_data[,"OverTimeNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "OverTimeNum"]
## t = 8.3288, df = 868, p-value = 3.164e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2093563 0.3324905
## sample estimates:
##       cor 
## 0.2720366
cor.test(corr_data[,"AttritionNum"], corr_data[,"PercentSalaryHike"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "PercentSalaryHike"]
## t = 0.45165, df = 868, p-value = 0.6516
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05118981  0.08171055
## sample estimates:
##        cor 
## 0.01532807
cor.test(corr_data[,"AttritionNum"], corr_data[,"TotalWorkingYears"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "TotalWorkingYears"]
## t = -4.9965, df = 868, p-value = 7.059e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2311035 -0.1018725
## sample estimates:
##        cor 
## -0.1672061
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsAtCompany"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "YearsAtCompany"]
## t = -3.8252, df = 868, p-value = 0.00014
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.19356333 -0.06282598
## sample estimates:
##        cor 
## -0.1287541
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsInCurrentRole"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "YearsInCurrentRole"]
## t = -4.6596, df = 868, p-value = 3.665e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.22039310 -0.09069163
## sample estimates:
##        cor 
## -0.1562157
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsSinceLastPromotion"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "YearsSinceLastPromotion"]
## t = -0.13474, df = 868, p-value = 0.8928
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07101747  0.06191123
## sample estimates:
##          cor 
## -0.004573321
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsWithCurrManager"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "YearsWithCurrManager"]
## t = -4.3718, df = 868, p-value = 1.381e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2111876 -0.0811078
## sample estimates:
##        cor 
## -0.1467822
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsWithCurrManager"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "YearsWithCurrManager"]
## t = -4.3718, df = 868, p-value = 1.381e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2111876 -0.0811078
## sample estimates:
##        cor 
## -0.1467822
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobRoleNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "JobRoleNum"]
## t = 2.5546, df = 868, p-value = 0.0108
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02003487 0.15197869
## sample estimates:
##        cor 
## 0.08638557
cor.test(corr_data[,"AttritionNum"], corr_data[,"DepartmentNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "DepartmentNum"]
## t = 2.5739, df = 868, p-value = 0.01022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02068592 0.15261490
## sample estimates:
##        cor 
## 0.08703199
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobLevel"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "JobLevel"]
## t = -4.8409, df = 868, p-value = 1.53e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.22616491 -0.09671294
## sample estimates:
##        cor 
## -0.1621364
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobRoleNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "JobRoleNum"]
## t = 2.5546, df = 868, p-value = 0.0108
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02003487 0.15197869
## sample estimates:
##        cor 
## 0.08638557
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobInvolvementNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "JobInvolvementNum"]
## t = -5.633, df = 868, p-value = 2.393e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2511246 -0.1228612
## sample estimates:
##        cor 
## -0.1877934
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobSatisfactionNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "JobSatisfactionNum"]
## t = -3.1862, df = 868, p-value = 0.001493
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.17275210 -0.04135071
## sample estimates:
##        cor 
## -0.1075209
cor.test(corr_data[,"AttritionNum"], corr_data[,"PerformanceRatingNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "PerformanceRatingNum"]
## t = 0.45182, df = 868, p-value = 0.6515
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05118406  0.08171629
## sample estimates:
##        cor 
## 0.01533384
cor.test(corr_data[,"AttritionNum"], corr_data[,"RelationshipSatisfactionNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "RelationshipSatisfactionNum"]
## t = -1.169, df = 868, p-value = 0.2427
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10583346  0.02688998
## sample estimates:
##         cor 
## -0.03964661
cor.test(corr_data[,"AttritionNum"], corr_data[,"WorkLifeBalanceNum"])
## 
##  Pearson's product-moment correlation
## 
## data:  corr_data[, "AttritionNum"] and corr_data[, "WorkLifeBalanceNum"]
## t = -2.6561, df = 868, p-value = 0.00805
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.15532845 -0.02346401
## sample estimates:
##         cor 
## -0.08978971

Linear Regression - Full Model Attrition

train_sampled_reg <- conv.attr.reg(train_sampled)
full_model <-AttritionNum~.
fit1 <- lm(full_model, data = train_sampled_reg)
summary(fit1)
## 
## Call:
## lm(formula = full_model, data = train_sampled_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85806 -0.29074  0.01326  0.26600  0.99204 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.024e+00  3.452e-01   2.966  0.00323 ** 
## ID                           1.024e-04  9.317e-05   1.099  0.27256    
## Age                         -8.879e-03  3.313e-03  -2.680  0.00772 ** 
## DailyRate                   -4.430e-05  6.014e-05  -0.737  0.46185    
## DistanceFromHome             5.619e-03  2.769e-03   2.029  0.04325 *  
## Education                    1.966e-02  2.318e-02   0.848  0.39690    
## EnvironmentSatisfaction     -4.086e-02  1.985e-02  -2.058  0.04036 *  
## HourlyRate                   2.947e-06  1.138e-03   0.003  0.99794    
## JobLevel                    -9.843e-03  7.594e-02  -0.130  0.89695    
## MonthlyIncome               -7.731e-06  1.748e-05  -0.442  0.65866    
## MonthlyRate                 -2.318e-06  3.243e-06  -0.715  0.47523    
## NumCompaniesWorked           1.814e-02  9.926e-03   1.827  0.06853 .  
## PercentSalaryHike            3.602e-03  8.068e-03   0.446  0.65558    
## StockOptionLevel            -8.701e-02  3.468e-02  -2.509  0.01259 *  
## TotalWorkingYears           -4.856e-03  6.539e-03  -0.743  0.45819    
## TrainingTimesLastYear       -5.759e-02  1.884e-02  -3.057  0.00241 ** 
## YearsAtCompany               9.942e-03  7.958e-03   1.249  0.21239    
## YearsInCurrentRole          -7.901e-03  1.218e-02  -0.649  0.51703    
## YearsSinceLastPromotion      8.550e-03  9.092e-03   0.940  0.34768    
## YearsWithCurrManager        -1.281e-02  1.177e-02  -1.088  0.27734    
## BusinessTravelNum            9.043e-02  3.820e-02   2.367  0.01849 *  
## DepartmentNum                1.198e-01  5.128e-02   2.336  0.02007 *  
## EducationFieldNum            1.714e-02  1.618e-02   1.059  0.29040    
## GenderNum                    2.841e-02  4.366e-02   0.651  0.51570    
## JobRoleNum                  -4.013e-03  1.091e-02  -0.368  0.71320    
## MaritalStatusNum             4.514e-02  3.825e-02   1.180  0.23876    
## OverTimeNum                  2.998e-01  4.514e-02   6.642 1.25e-10 ***
## JobInvolvementNum           -1.266e-01  2.762e-02  -4.585 6.42e-06 ***
## JobSatisfactionNum          -3.829e-02  1.986e-02  -1.928  0.05468 .  
## PerformanceRatingNum        -3.884e-02  8.050e-02  -0.483  0.62976    
## RelationshipSatisfactionNum -6.298e-03  1.889e-02  -0.333  0.73903    
## WorkLifeBalanceNum          -6.345e-02  3.049e-02  -2.081  0.03820 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3983 on 336 degrees of freedom
## Multiple R-squared:  0.4206, Adjusted R-squared:  0.3672 
## F-statistic:  7.87 on 31 and 336 DF,  p-value: < 2.2e-16

With Full Model following predictors turned to be statistically significant. Age, DistanceFromHome, EnvironmentSatisfaction, NumCompaniesWorked, StockOptionLevel, TrainingTimesLastYear, BusinessTravelNum, DepartmentNum, MaritalStatusNum, OverTimeNum, JobInvolvementNum, JobSatisfactionNum, WorkLifeBalanceNum

Linear Regression -Reduced Model1 Attrition

Model1 <-AttritionNum~ Age + DistanceFromHome + EnvironmentSatisfaction + NumCompaniesWorked + StockOptionLevel + TrainingTimesLastYear + BusinessTravelNum + DepartmentNum + MaritalStatusNum + OverTimeNum + JobInvolvementNum + JobSatisfactionNum + WorkLifeBalanceNum
fit1 <- lm(Model1, data = train_sampled_reg)
summary(fit1)
## 
## Call:
## lm(formula = Model1, data = train_sampled_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.84122 -0.28684 -0.01888  0.26245  1.08348 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.122969   0.225203   4.986 9.65e-07 ***
## Age                     -0.012333   0.002450  -5.034 7.68e-07 ***
## DistanceFromHome         0.006133   0.002712   2.261  0.02435 *  
## EnvironmentSatisfaction -0.035423   0.019012  -1.863  0.06327 .  
## NumCompaniesWorked       0.013191   0.008369   1.576  0.11586    
## StockOptionLevel        -0.080171   0.033269  -2.410  0.01647 *  
## TrainingTimesLastYear   -0.047790   0.017976  -2.659  0.00820 ** 
## BusinessTravelNum        0.089535   0.037360   2.397  0.01707 *  
## DepartmentNum            0.108082   0.041434   2.609  0.00948 ** 
## MaritalStatusNum         0.045473   0.037300   1.219  0.22361    
## OverTimeNum              0.304671   0.043731   6.967 1.59e-11 ***
## JobInvolvementNum       -0.133775   0.026956  -4.963 1.08e-06 ***
## JobSatisfactionNum      -0.045894   0.018983  -2.418  0.01613 *  
## WorkLifeBalanceNum      -0.071976   0.029626  -2.430  0.01562 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3977 on 354 degrees of freedom
## Multiple R-squared:  0.3914, Adjusted R-squared:  0.3691 
## F-statistic: 17.51 on 13 and 354 DF,  p-value: < 2.2e-16

Conclusion

The factors which increases attrition rate in employees are: (because of their positive coefficient in Model) 1) More Business Travel 2) Office Distance From Home 3) Worked in more number of companies 4) OverTime to complete work 5) More number in present company 6) If he is not promoted from long time.

The factors which decreases attrition rate in employees are: (because of their negative coefficient in Model) 1) Higher Age of Employee 2) Good Environment Satisfaction 3) Good Job Involvement 4) Good Job Satisfaction 5) Good Relationship Satisfaction 6) Good Work Life Balance 7) More number of years in Current Role 8) More number of years with Current Manager.

The factors which adversely affect Attrition Rate means to very high extent are: 1) More Business Travel 2) Office Distance From Home 3) Worked in more number of companies 4) Overtime at work 5) Good Environment Satisfaction 6) Good Job Involvement 7) Good Job Satisfaction

Linear Regression -Full Model Salary Prediction

set.seed(1234)
splitPerc = .70
trainIndices = sample(1:dim(classification_data)[1],round(splitPerc * dim(classification_data)[1]))
train = classification_data[trainIndices,]
test = classification_data[-trainIndices,]
reg_train <- conv.attr.reg(train)
reg_test <- conv.attr.reg(test)

Model1 <-MonthlyIncome ~ .
fit1 <- lm(Model1, data = reg_train)
summary(fit1)
## 
## Call:
## lm(formula = Model1, data = reg_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5703.7  -786.2     8.1   728.8  4583.0 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.970e+02  8.934e+02   0.332 0.739685    
## ID                          -3.058e-01  2.197e-01  -1.392 0.164581    
## Age                         -4.550e+00  8.432e+00  -0.540 0.589679    
## DailyRate                    1.049e-01  1.347e-01   0.779 0.436533    
## DistanceFromHome            -9.438e+00  6.605e+00  -1.429 0.153575    
## Education                   -3.473e+01  5.387e+01  -0.645 0.519411    
## EnvironmentSatisfaction     -7.812e+01  5.001e+01  -1.562 0.118825    
## HourlyRate                   2.485e+00  2.736e+00   0.908 0.364017    
## JobLevel                     3.855e+03  7.921e+01  48.663  < 2e-16 ***
## MonthlyRate                 -3.306e-03  7.861e-03  -0.421 0.674210    
## NumCompaniesWorked          -1.281e+01  2.525e+01  -0.507 0.612254    
## PercentSalaryHike            6.575e+01  2.350e+01   2.798 0.005318 ** 
## StockOptionLevel            -1.793e+02  8.400e+01  -2.134 0.033243 *  
## TotalWorkingYears            6.542e+01  1.597e+01   4.097 4.78e-05 ***
## TrainingTimesLastYear        3.740e+01  4.279e+01   0.874 0.382489    
## YearsAtCompany               8.253e+00  2.114e+01   0.390 0.696409    
## YearsInCurrentRole          -1.778e+01  2.649e+01  -0.671 0.502415    
## YearsSinceLastPromotion      2.117e+01  2.250e+01   0.941 0.347036    
## YearsWithCurrManager        -4.779e+01  2.602e+01  -1.837 0.066764 .  
## AttritionNum                 2.007e+02  1.731e+02   1.160 0.246598    
## BusinessTravelNum           -5.248e+01  1.016e+02  -0.516 0.605720    
## DepartmentNum               -9.042e+02  1.455e+02  -6.214 9.91e-10 ***
## EducationFieldNum           -3.453e+01  4.097e+01  -0.843 0.399711    
## GenderNum                    1.724e+02  1.100e+02   1.567 0.117768    
## JobRoleNum                   1.240e+02  3.297e+01   3.759 0.000188 ***
## MaritalStatusNum            -1.655e+02  1.000e+02  -1.655 0.098432 .  
## OverTimeNum                 -1.505e+02  1.270e+02  -1.185 0.236467    
## JobInvolvementNum            1.360e+02  7.699e+01   1.766 0.077947 .  
## JobSatisfactionNum           5.880e+01  4.972e+01   1.183 0.237462    
## PerformanceRatingNum        -4.984e+02  2.355e+02  -2.116 0.034769 *  
## RelationshipSatisfactionNum -2.748e+00  4.856e+01  -0.057 0.954884    
## WorkLifeBalanceNum          -1.807e+01  8.006e+01  -0.226 0.821527    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 577 degrees of freedom
## Multiple R-squared:  0.9264, Adjusted R-squared:  0.9224 
## F-statistic: 234.3 on 31 and 577 DF,  p-value: < 2.2e-16
lm.pred.full <- predict(fit1, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 1514.196

Linear Regression - Reduced Model1 Salary Prediction

red_model1_sal <- -MonthlyIncome ~ PerformanceRatingNum + JobInvolvementNum + MaritalStatusNum + JobRoleNum + DepartmentNum + YearsWithCurrManager + TotalWorkingYears + StockOptionLevel + PercentSalaryHike + JobLevel

fit1 <- lm(red_model1_sal, data = reg_train)
summary(fit1)
## 
## Call:
## lm(formula = red_model1_sal, data = reg_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4371.7  -736.8   -44.8   785.6  5340.8 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -138.45     661.04  -0.209 0.834177    
## PerformanceRatingNum   525.10     233.35   2.250 0.024791 *  
## JobInvolvementNum     -119.95      75.10  -1.597 0.110764    
## MaritalStatusNum       166.23      97.05   1.713 0.087257 .  
## JobRoleNum            -116.03      32.30  -3.592 0.000355 ***
## DepartmentNum          857.99     142.99   6.000 3.42e-09 ***
## YearsWithCurrManager    37.74      17.13   2.204 0.027937 *  
## TotalWorkingYears      -60.51      11.77  -5.141 3.71e-07 ***
## StockOptionLevel       192.72      82.44   2.338 0.019730 *  
## PercentSalaryHike      -65.89      23.23  -2.836 0.004721 ** 
## JobLevel             -3845.69      77.31 -49.744  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1311 on 598 degrees of freedom
## Multiple R-squared:  0.9239, Adjusted R-squared:  0.9227 
## F-statistic: 726.2 on 10 and 598 DF,  p-value: < 2.2e-16
lm.pred.full <- predict(fit1, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 14999.37

Linear Regression - Reduced Model2 Salary Prediction

red_model2_sal <- MonthlyIncome ~ JobRoleNum + 
  DepartmentNum + TotalWorkingYears + 
  PercentSalaryHike + JobLevel + 
  YearsWithCurrManager 

fit2 <- lm(red_model2_sal, data = reg_train)
summary(fit2)
## 
## Call:
## lm(formula = red_model2_sal, data = reg_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5432.3  -806.7     2.4   765.4  4314.4 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1036.19     346.50  -2.990 0.002899 ** 
## JobRoleNum             124.48      32.40   3.842 0.000135 ***
## DepartmentNum         -898.33     143.48  -6.261 7.27e-10 ***
## TotalWorkingYears       59.06      11.84   4.988 7.99e-07 ***
## PercentSalaryHike       25.29      14.45   1.751 0.080509 .  
## JobLevel              3861.90      77.60  49.766  < 2e-16 ***
## YearsWithCurrManager   -40.66      17.21  -2.363 0.018461 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1320 on 602 degrees of freedom
## Multiple R-squared:  0.9224, Adjusted R-squared:  0.9216 
## F-statistic:  1192 on 6 and 602 DF,  p-value: < 2.2e-16
lm.pred.full <- predict(fit2, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 1446.239

Linear Regression - Reduced Model3 Salary Prediction

red_model3_sal <- MonthlyIncome ~ JobRoleNum + DepartmentNum +  TotalWorkingYears + PercentSalaryHike + JobLevel + YearsAtCompany 

fit3 <- lm(red_model3_sal, data = reg_train)
summary(fit3)
## 
## Call:
## lm(formula = red_model3_sal, data = reg_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5318.1  -769.4     5.2   774.2  4334.8 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1043.57     347.82  -3.000  0.00281 ** 
## JobRoleNum          127.58      32.50   3.925 9.66e-05 ***
## DepartmentNum      -924.20     143.43  -6.443 2.39e-10 ***
## TotalWorkingYears    56.89      12.46   4.566 6.04e-06 ***
## PercentSalaryHike    25.00      14.50   1.724  0.08520 .  
## JobLevel           3862.84      78.00  49.524  < 2e-16 ***
## YearsAtCompany      -13.59      11.76  -1.156  0.24818    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1325 on 602 degrees of freedom
## Multiple R-squared:  0.9218, Adjusted R-squared:  0.921 
## F-statistic:  1183 on 6 and 602 DF,  p-value: < 2.2e-16
lm.pred.full <- predict(fit3, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 1453.472

Conclusion The Reduced Model2 has less RMSE. Lets uses that for salary prediction

Selecting best model by RMSE and using entire dataset for Salary Prediction

str(no_salary_data)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  871 872 873 874 875 876 877 878 879 880 ...
##  $ Age                     : int  43 33 55 36 27 39 33 21 30 51 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 2 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 2 3 3 1 3 3 1 2 2 3 ...
##  $ DailyRate               : int  1422 461 267 1351 1302 895 750 251 1312 1405 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 3 2 2 3 3 2 2 2 ...
##  $ DistanceFromHome        : int  2 13 13 9 19 5 22 10 23 11 ...
##  $ Education               : int  4 1 4 4 3 3 2 2 3 2 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 3 2 5 6 3 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1849 995 1372 1949 1619 42 160 1279 159 1367 ...
##  $ EnvironmentSatisfaction : int  1 2 1 1 4 4 3 1 1 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 2 2 1 2 1 ...
##  $ HourlyRate              : int  92 53 85 66 67 56 95 45 96 82 ...
##  $ JobInvolvement          : int  3 3 4 4 2 3 3 2 1 2 ...
##  $ JobLevel                : int  2 1 4 1 1 2 2 1 1 4 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 8 3 3 9 8 3 7 5 ...
##  $ JobSatisfaction         : int  4 4 3 2 1 4 2 3 3 2 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 2 3 3 2 1 2 2 3 1 3 ...
##  $ MonthlyRate             : int  19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
##  $ NumCompaniesWorked      : int  1 3 6 1 1 3 0 1 1 3 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
##  $ PercentSalaryHike       : int  20 18 17 22 11 14 13 20 25 16 ...
##  $ PerformanceRating       : int  4 3 3 4 3 3 3 4 4 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 2 1 3 1 3 3 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 0 2 1 1 0 3 0 ...
##  $ TotalWorkingYears       : int  7 5 24 5 7 19 8 2 10 29 ...
##  $ TrainingTimesLastYear   : int  5 4 2 3 3 6 2 2 2 1 ...
##  $ WorkLifeBalance         : int  3 3 2 3 3 4 4 1 2 2 ...
##  $ YearsAtCompany          : int  7 3 19 5 7 1 7 2 10 5 ...
##  $ YearsInCurrentRole      : int  7 2 7 4 7 0 7 2 7 2 ...
##  $ YearsSinceLastPromotion : int  7 0 3 0 0 0 0 2 0 0 ...
##  $ YearsWithCurrManager    : int  7 2 8 2 7 0 7 2 9 3 ...
pred_data <- conv.attr.reg(no_salary_data)
lm.model.reduced <- lm(red_model2_sal, data = reg_train)
lm.pred.salary <- predict(lm.model.reduced, newdata = pred_data)
no.salary.prediction <- data.frame(ID = pred_data$ID, MonthlyIncome = round(lm.pred.salary))
path_out2 = '/Users/lijjumathew/Code/Github/MSDS-Doing-Datascience-Project2/Results/'
fileName2 = paste(path_out, 'Case2PredictionsMathewSalary.csv',sep = '')
#write.csv(no.salary.prediction, fileName2, row.names = F)